
The response tabel is built only on train dataset. For a category which is not there in train data and present in test data, we will encode them with default values Ex: in our test data if have State: D then we encode it as [0.5, 0.05]
with X-axis as n_estimators, Y-axis as max_depth, and Z-axis as AUC Score , we have given the notebook which explains how to plot this 3d plot, you can find it in the same drive 3d_scatter_plot.ipynbor
seaborn heat maps with rows as n_estimators, columns as max_depth, and values inside the cell representing AUC Score 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import nltk
# nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
sample_sentence_1='I am happy.'
ss_1 = sid.polarity_scores(sample_sentence_1)
print('sentiment score for sentence 1',ss_1)
sample_sentence_2='I am sad.'
ss_2 = sid.polarity_scores(sample_sentence_2)
print('sentiment score for sentence 2',ss_2)
sample_sentence_3='I am going to New Delhi tommorow.'
ss_3 = sid.polarity_scores(sample_sentence_3)
print('sentiment score for sentence 3',ss_3)
sentiment score for sentence 1 {'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719}
sentiment score for sentence 2 {'neg': 0.756, 'neu': 0.244, 'pos': 0.0, 'compound': -0.4767}
sentiment score for sentence 3 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
#Modules used
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
projecttitle=pd.read_csv('train_data.csv',nrows=35000)#for getting projectitle feature
data = pd.read_csv('preprocessed_data.csv',nrows=35000)
data['project_title']=projecttitle.project_title
X_train, X_test, y_train, y_test = train_test_split(data,data['project_is_approved'], test_size=0.33, random_state=42)
#In this train,test split i need values for response coding, i will drop y values once my i finish my response coding for categorical data
print(data.columns)
Index(['school_state', 'teacher_prefix', 'project_grade_category',
'teacher_number_of_previously_posted_projects', 'project_is_approved',
'clean_categories', 'clean_subcategories', 'essay', 'price',
'project_title'],
dtype='object')
def cat_encoding_train(df_train,df_test):
L=df_train.columns
t=(df_train[df_train.columns[0]].unique())
temp=[]
for k in t:
no=df_train[(df_train[L[0]]==k)&(df_train[L[1]]==0)].shape[0]
yes=df_train[(df_train[L[0]]==k)&(df_train[L[1]]==1)].shape[0]
no=no/(yes+no)
yes=yes/(yes+no)
temp.append([k,yes,no])
new_df=pd.DataFrame(temp,columns=[L[0],'State_1_'+L[0],'state_0_'+L[0]])
new_df_train=df_train.merge(new_df,on=L[0],how='left')
new_df_test=df_test.merge(new_df,on=L[0],how='left')
# print(new_df_test.isna().sum())
new_df_test.fillna(0.5,inplace=True)# If any empty point found in the test fill it with 0.5 equal probability
new_df_train.drop([L[0]],axis=1,inplace=True)
new_df_test.drop([L[0]],axis=1,inplace=True)
new_df_train.drop([L[1]],axis=1,inplace=True)
new_df_test.drop([L[1]],axis=1,inplace=True)
return new_df_train,new_df_test
#Converting the all category data to response coding
school_state_train,School_state_test=cat_encoding_train(X_train[['school_state','project_is_approved']],X_test[['school_state','project_is_approved']])
teaher_prefix_train,teacher_prefix_test=cat_encoding_train(X_train[['teacher_prefix','project_is_approved']],X_test[['teacher_prefix','project_is_approved']])
project_grade_category_train,project_grade_category_test=cat_encoding_train(X_train[['project_grade_category','project_is_approved']],X_test[['project_grade_category','project_is_approved']])
clean_categories_train,clean_categories_test=cat_encoding_train(X_train[['clean_categories','project_is_approved']],X_test[['clean_categories','project_is_approved']])
clean_subcategories_train,clean_subcategories_test=cat_encoding_train(X_train[['clean_subcategories','project_is_approved']],X_test[['clean_subcategories','project_is_approved']])
#droping project_is_approved y value from train and test data
X_train=X_train.drop('project_is_approved',axis=1)
X_test=X_test.drop('project_is_approved',axis=1)
#Normalizating the price column
standard_vector1=Normalizer()
standard_vector1.fit(X_train.price.values.reshape(-1,1))
X_train_price_norm=standard_vector1.transform(X_train.price.values.reshape(-1,1))
X_test_price_norm=standard_vector1.transform(X_test.price.values.reshape(-1,1))
#Normalizating the teacher_number_of_previously_posted_projects column
standard_vector1=Normalizer()
standard_vector1.fit(X_train.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
X_train_teacher_number_of_previously_posted_projects_norm=standard_vector1.transform(X_train.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
X_test_teacher_number_of_previously_posted_projects_norm=standard_vector1.transform(X_test.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
#sentiment score for essay
X_senti_train=X_train.essay.map(lambda x:list(sid.polarity_scores(x).values()))
essay_sentiment_train=pd.DataFrame(columns=['neg','neu','pos','compound'])
for i in X_senti_train:
d={'neg':i[0],'neu':i[1],'pos':i[2],'compound':i[3]}
essay_sentiment_train=essay_sentiment_train.append(d,ignore_index=True)
X_senti_test=X_test.essay.map(lambda x:list(sid.polarity_scores(x).values()))
essay_sentiment_test=pd.DataFrame(columns=['neg','neu','pos','compound'])
for i in X_senti_test:
d={'neg':i[0],'neu':i[1],'pos':i[2],'compound':i[3]}
essay_sentiment_test=essay_sentiment_test.append(d,ignore_index=True)
## 3. perform tfidf vectorization of text data.
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,4), max_features=5000)
vectorizer.fit(X_train['essay'].values)
X_train_essay_Tfidf = vectorizer.transform(X_train['essay'].values).todense()
X_test_essay_Tfidf=vectorizer.transform(X_test['essay'].values).todense()
## 3. perform tfidf vectorization of text data.
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,4), max_features=5000)
vectorizer.fit(X_train['project_title'].values)
X_train_project_title_Tfidf = vectorizer.transform(X_train['project_title'].values).todense()
X_test_project_title_Tfidf=vectorizer.transform(X_test['project_title'].values).todense()
#please use below code to load glove vectors
import pickle
with open('glove_vectors', 'rb') as f:
model = pickle.load(f)
glove_words = set(model.keys())
def TFIDF_W2V(preprocessed_essays):
# average Word2Vec
# compute average word2vec for each review.
avg_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
cnt_words =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if word in glove_words:
vector += model[word]
cnt_words += 1
if cnt_words != 0:
vector /= cnt_words
avg_w2v_vectors.append(vector)
print(len(avg_w2v_vectors))
print(len(avg_w2v_vectors[0]))
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
# average Word2Vec
# compute average word2vec for each review.
tfidf_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
tf_idf_weight =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if (word in glove_words) and (word in tfidf_words):
vec = model[word] # getting the vector for each word
# here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
vector += (vec * tf_idf) # calculating tfidf weighted w2v
tf_idf_weight += tf_idf
if tf_idf_weight != 0:
vector /= tf_idf_weight
tfidf_w2v_vectors.append(vector)
tfidf_w2v_vectors=tfidf_w2v_vectors
return tfidf_w2v_vectors
tfidf_w2v_vectors=TFIDF_W2V(X_train.essay)
100%|██████████████████████████████████████████████████████████████████████████| 23450/23450 [00:05<00:00, 3985.42it/s]
23450 300
100%|███████████████████████████████████████████████████████████████████████████| 23450/23450 [00:46<00:00, 499.96it/s]
tfidf_w2v_vectors_test=TFIDF_W2V(X_test.essay)
100%|██████████████████████████████████████████████████████████████████████████| 11550/11550 [00:02<00:00, 3941.43it/s]
11550 300
100%|███████████████████████████████████████████████████████████████████████████| 11550/11550 [00:23<00:00, 493.70it/s]
tfidf_w2v_vectors_project_title=TFIDF_W2V(X_train.project_title)
100%|████████████████████████████████████████████████████████████████████████| 23450/23450 [00:00<00:00, 166063.23it/s] 0%| | 0/23450 [00:00<?, ?it/s]
23450 300
100%|████████████████████████████████████████████████████████████████████████| 23450/23450 [00:00<00:00, 106376.47it/s]
tfidf_w2v_vectors_test_project_title=TFIDF_W2V(X_test.project_title)
100%|████████████████████████████████████████████████████████████████████████| 11550/11550 [00:00<00:00, 163111.82it/s] 100%|████████████████████████████████████████████████████████████████████████| 11550/11550 [00:00<00:00, 103398.83it/s]
11550 300
#Forming_dataset1
dataset1_train=np.hstack((school_state_train,teaher_prefix_train,project_grade_category_train,X_train_project_title_Tfidf,clean_categories_train,clean_subcategories_train,X_train_price_norm,X_train_teacher_number_of_previously_posted_projects_norm,essay_sentiment_train,X_train_essay_Tfidf))
print(dataset1_train.shape)
(23450, 7815)
dataset1_test=np.hstack((School_state_test,teacher_prefix_test,project_grade_category_test,X_test_project_title_Tfidf,clean_categories_test,clean_subcategories_test,
X_test_price_norm,X_test_teacher_number_of_previously_posted_projects_norm,essay_sentiment_test,X_test_essay_Tfidf))
print(dataset1_test.shape)
(11550, 7815)
dataset2_train=np.hstack((school_state_train,teaher_prefix_train,project_grade_category_train,clean_categories_train,clean_subcategories_train,
X_train_price_norm,X_train_teacher_number_of_previously_posted_projects_norm,tfidf_w2v_vectors,tfidf_w2v_vectors_project_title
))
print(dataset2_train.shape)
(23450, 612)
dataset2_test=np.hstack((School_state_test,teacher_prefix_test,project_grade_category_test,clean_categories_test,clean_subcategories_test,
X_test_price_norm,X_test_teacher_number_of_previously_posted_projects_norm,tfidf_w2v_vectors_test,tfidf_w2v_vectors_test_project_title))
print(dataset2_test.shape)
(11550, 612)
Apply GBDT on different kind of featurization as mentioned in the instructions
For Every model that you work on make sure you do the step 2 and step 3 of instrucations
# please write all the code with proper documentation, and proper titles for each subsection
# go through documentations and blogs before you start coding
# first figure out what to do, and then think about how to do.
# reading and understanding error messages will be very much helpfull in debugging your code
# when you plot any graph make sure you use
# a. Title, that describes your plot, this will be very helpful to the reader
# b. Legends if needed
# c. X-axis label
# d. Y-axis label
n_estimators=[1,10,20,30]#List of n_estimators used to find hyper parameter
max_depth=[1,2,5,10]#list of max_depth used to find the hyper paremeter
def find_AUC(n_estimators,depth,x,y,X_test,Y_test):
AUC_score=[]
count=0
for i in n_estimators:
for j in depth:
clf_model1=GradientBoostingClassifier(n_estimators=i, learning_rate=1.0,max_depth=j, random_state=0)
clf_model1.fit(x,y)
y_pred=clf_model1.predict(X_test)
#Y_test_true=y[y==1]
#X_test_true=x[x==1]
count=count+1
print(count,end=' ')
AUC_score.append([i,j,roc_auc_score(Y_test,clf_model1.predict_proba(X_test)[:,1])])
print(AUC_score)
return AUC_score
auc_values_dataset1=find_AUC(n_estimators,max_depth,dataset1_train,y_train,dataset1_test,y_test)
print(auc_values_dataset1)
1 [[1, 1, 0.5281398196197392]] 2 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237]] 3 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969]] 4 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063]] 5 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208]] 6 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867]] 7 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647]] 8 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839]] 9 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027]] 10 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739]] 11 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286]] 12 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212]] 13 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332]] 14 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332], [30, 2, 0.6468615947460434]] 15 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332], [30, 2, 0.6468615947460434], [30, 5, 0.63497255716565]] 16 [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332], [30, 2, 0.6468615947460434], [30, 5, 0.63497255716565], [30, 10, 0.6102805977567951]] [[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332], [30, 2, 0.6468615947460434], [30, 5, 0.63497255716565], [30, 10, 0.6102805977567951]]
tempt1=[[1, 1, 0.5281398196197392], [1, 2, 0.5501157533509237], [1, 5, 0.5760326983633969], [1, 10, 0.5729110438114063], [10, 1, 0.6120869069100208], [10, 2, 0.6313369720417867], [10, 5, 0.6324219398666647], [10, 10, 0.6200982175798839], [20, 1, 0.625649921029027], [20, 2, 0.6428629151754739], [20, 5, 0.634125149587286], [20, 10, 0.6154291847891212], [30, 1, 0.6416208448608332], [30, 2, 0.6468615947460434], [30, 5, 0.63497255716565], [30, 10, 0.6102805977567951]]
auc_values_dataset2=find_AUC(n_estimators,max_depth,dataset2_train,y_train,dataset2_test,y_test)
print(auc_values_dataset2)
1 [[1, 1, 0.5281398196197392]] 2 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992]] 3 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562]] 4 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604]] 5 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355]] 6 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477]] 7 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268]] 8 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339]] 9 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636]] 10 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288]] 11 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777]] 12 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453]] 13 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368]] 14 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368], [30, 2, 0.6577192833005108]] 15 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368], [30, 2, 0.6577192833005108], [30, 5, 0.6247606884228145]] 16 [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368], [30, 2, 0.6577192833005108], [30, 5, 0.6247606884228145], [30, 10, 0.6007274066963436]] [[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368], [30, 2, 0.6577192833005108], [30, 5, 0.6247606884228145], [30, 10, 0.6007274066963436]]
temt2=[[1, 1, 0.5281398196197392], [1, 2, 0.5737852016725992], [1, 5, 0.6001477384128562], [1, 10, 0.5602244044278604], [10, 1, 0.6333244146018355], [10, 2, 0.6496037499817477], [10, 5, 0.6276404376547268], [10, 10, 0.580431804543339], [20, 1, 0.6449000748357636], [20, 2, 0.6583726109263288], [20, 5, 0.6220161094045777], [20, 10, 0.5929020474400453], [30, 1, 0.6574581946284368], [30, 2, 0.6577192833005108], [30, 5, 0.6247606884228145], [30, 10, 0.6007274066963436]]
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
import matplotlib.pyplot as plt
x1 = np.array(auc_values_dataset1)[:,0]
y1 = np.array(auc_values_dataset1)[:,1]
z1 = np.array(auc_values_dataset1)[:,2]
x2 = np.array(auc_values_dataset2)[:,0]
y2 = np.array(auc_values_dataset2)[:,1]
z2 = np.array(auc_values_dataset2)[:,2]
trace1 = go.Scatter3d(x=x1,y=y1,z=z1, name = 'train')
trace2 = go.Scatter3d(x=x2,y=y2,z=z2, name = 'Cross validation')
data = [trace1, trace2]
layout = go.Layout(scene = dict(
xaxis = dict(title='n_estimators'),
yaxis = dict(title='max_depth'),
zaxis = dict(title='AUC'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
fig.show()
plt.show()
def get_max_auc(ar):
AUC=0
min_sample=0
depth=0
for i in ar:
k,l,m= i
if AUC<m:
AUC=m
min_sample=k
depth=l
return min_sample,depth,AUC
n_es1,dep1,AUC1=get_max_auc(auc_values_dataset1)
print(f'For the first dataset N-estimator is {n_es1}, max_depth is {dep1} and Maximum AUC is {AUC1}')
n_es2,dep2,AUC2=get_max_auc(auc_values_dataset2)
print(f'For the second dataset N-estimator is {n_es2}, max_depth is {dep2} and Maximum AUC is {AUC2}')
For the first dataset N-estimator is 30, max_depth is 2 and Maximum AUC is 0.6468615947460434 For the second dataset N-estimator is 20, max_depth is 2 and Maximum AUC is 0.6583726109263288
#building the model with the best_hyper parameters
clf_model1=GradientBoostingClassifier(n_estimators=30, learning_rate=1.0,max_depth=2, random_state=0)
clf_model1.fit(dataset1_train,y_train)
y_pred_train_1=clf_model1.predict(dataset1_train)
y_pred_test_1=clf_model1.predict(dataset1_test)
new_proba_train=clf_model1.predict_proba(dataset1_train)
new_proba_test=clf_model1.predict_proba(dataset1_test)
from sklearn.metrics import roc_curve
fpr,tpr,th=roc_curve(y_train,new_proba_train[:,1])
fpr_t,tpr_t,th_t=roc_curve(y_test,new_proba_test[:,1])
def get_max(fpr,tpr,th):
temp=0
ind=0
for i,j in zip(fpr,tpr):
temp1=abs(j*(1-i))
if temp<temp1:
temp=temp1
req=th[ind]
ind=ind+1
return req
print(f'Best thershold value for train data is {get_max(fpr,tpr,th)}')
print(f'Best thershold value for test data is {get_max(fpr_t,tpr_t,th_t)}')
Best thershold value for train data is 0.8568787084260958 Best thershold value for train data is 0.8770090405696523
# Plot the ROC-AUC curves using the probability predictions made on train and test data.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
plt.plot(fpr,tpr,label='train')
plt.plot(fpr_t,tpr_t,label='test')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_curve for dataset1')
plt.show()
print(f' For train data the Roc score is {roc_auc_score(y_train,y_pred_train_1)}')
print(f' For Test data the Roc score is {roc_auc_score(y_test,y_pred_test_1)}')
For train data the Roc score is 0.569003390065034 For Test data the Roc score is 0.5230311658825533
def print_confusion(x):
TN,FP,FN,TP=x
req_details=['','Predicted: NO','Predicted: YES']
from prettytable import PrettyTable
x = PrettyTable()
x.field_names=req_details
x.add_row(['Actual : NO',TN,FP])
x.add_row(['ACual : YES',FN,TP])
print(x)
y_pred_train = (clf_model1.predict_proba(dataset1_train)[:,1] >= get_max(fpr,tpr,th)).astype(bool)
y_pred_test = (clf_model1.predict_proba(dataset1_test)[:,1] >= get_max(fpr_t,tpr_t,th_t)).astype(bool)
con=confusion_matrix(y_train,y_pred_train)
con_2=confusion_matrix(y_test,y_pred_test)
print(f'Train confusin matrix \n ')
print_confusion(con.ravel())
print(f'Test confusin matrix \n')
print_confusion(con_2.ravel())
Train confusin matrix +-------------+---------------+----------------+ | | Predicted: NO | Predicted: YES | +-------------+---------------+----------------+ | Actual : NO | 2229 | 1421 | | ACual : YES | 5433 | 14367 | +-------------+---------------+----------------+ Test confusin matrix +-------------+---------------+----------------+ | | Predicted: NO | Predicted: YES | +-------------+---------------+----------------+ | Actual : NO | 791 | 930 | | ACual : YES | 2557 | 7272 | +-------------+---------------+----------------+
#building the model with the best_hyper parameters
clf_model2=GradientBoostingClassifier(n_estimators=30, learning_rate=1.0,max_depth=2, random_state=0)
clf_model2.fit(dataset2_train,y_train)
y_pred_train_2=clf_model2.predict(dataset2_train)
y_pred_test_2=clf_model2.predict(dataset2_test)
new_proba_train=clf_model2.predict_proba(dataset2_train)
new_proba_test=clf_model2.predict_proba(dataset2_test)
from sklearn.metrics import roc_curve
fpr,tpr,th=roc_curve(y_train,new_proba_train[:,1])
fpr_t,tpr_t,th_t=roc_curve(y_test,new_proba_test[:,1])
def get_max(fpr,tpr,th):
temp=0
ind=0
for i,j in zip(fpr,tpr):
temp1=abs(j*(1-i))
if temp<temp1:
temp=temp1
req=th[ind]
ind=ind+1
return req
print(f'Best thershold value for train data is {get_max(fpr,tpr,th)}')
print(f'Best thershold value for test data is {get_max(fpr_t,tpr_t,th_t)}')
Best thershold value for train data is 0.8464357356094186 Best thershold value for test data is 0.8390134709435757
# Plot the ROC-AUC curves using the probability predictions made on train and test data.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
plt.plot(fpr,tpr,label='train')
plt.plot(fpr_t,tpr_t,label='test')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_curve for dataset2')
plt.show()
print(f' For train data the Roc score is {roc_auc_score(y_train,y_pred_train_2)}')
print(f' For Test data the Roc score is {roc_auc_score(y_test,y_pred_test_2)}')
For train data the Roc score is 0.5552290023522901 For Test data the Roc score is 0.5274625497518313
y_pred_train = (clf_model2.predict_proba(dataset2_train)[:,1] >= get_max(fpr,tpr,th)).astype(bool)
y_pred_test = (clf_model2.predict_proba(dataset2_test)[:,1] >= get_max(fpr_t,tpr_t,th_t)).astype(bool)
con=confusion_matrix(y_train,y_pred_train)
con_2=confusion_matrix(y_test,y_pred_test)
print(f'Train confusin matrix \n ')
print_confusion(con.ravel())
print(f'Test confusin matrix \n')
print_confusion(con_2.ravel())
Train confusin matrix +-------------+---------------+----------------+ | | Predicted: NO | Predicted: YES | +-------------+---------------+----------------+ | Actual : NO | 2476 | 1174 | | ACual : YES | 6183 | 13617 | +-------------+---------------+----------------+ Test confusin matrix +-------------+---------------+----------------+ | | Predicted: NO | Predicted: YES | +-------------+---------------+----------------+ | Actual : NO | 960 | 761 | | ACual : YES | 3091 | 6738 | +-------------+---------------+----------------+
as mentioned in the step 4 of instructions
req_details=['Vectorizer','Model','Hyperparameter-1','Hyperparameter-2','AUC']
n_es1,dep1,AUC1
from prettytable import PrettyTable
x = PrettyTable()
x.field_names=req_details
x.add_row(['TFIDF','Brute',n_es1,dep1,AUC1])
x.add_row(['TFIDF w2v','Brute',n_es2,dep2,AUC2])
print(x)
+------------+-------+------------------+------------------+--------------------+ | Vectorizer | Model | Hyperparameter-1 | Hyperparameter-2 | AUC | +------------+-------+------------------+------------------+--------------------+ | TFIDF | Brute | 30 | 2 | 0.6468615947460434 | | TFIDF w2v | Brute | 20 | 2 | 0.6583726109263288 | +------------+-------+------------------+------------------+--------------------+
#Summary
#For the first model we used TFIDF and response coding to get features
#We found the that n_estimator =30 and max_depth =2 to get the maximum AUC
#The Maximum AUc is 0.64 for model 1
#for the second model we used TFIDF w2v and response coding to get features
#We found the that n_estimator =20 and max_depth =2 to get the maximum AUC
#The Maximum AUc is 0.65 for model 2
# The biggest model Found in this model is it take a lot of time to train and it is easy to overfit the model